This notebook reproduces the results of the paper "A Machine Learning Approach to Live Migration Modeling", presented at SoCC '17.
%matplotlib notebook
import statistics
import time
import multiprocessing
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats.mstats import gmean
from matplotlib.ticker import EngFormatter
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import make_pipeline
df = pd.read_csv('dataset/2017.socc.dataset.csv')
# Shuffle rows (no fixed random_state, so exact numbers vary slightly across runs)
df = df.sample(frac=1).reset_index(drop=True)
# Rename column names for readability
df = df.rename(columns={'qemu_tt': 'Total Time', 'qemu_dt': 'Downtime', 'qemu_td': 'Total Data',
'performance': 'Performance', 'used_cpu_src': 'SRC.CPU', 'used_mem_src': 'SRC.MEM'})
# Rename the column values to readable strings
df['capability'] = df['capability'].map({0: 'PRE', 1: 'THR', 2: 'DLTC', 3: 'DTC', 4: 'POST'})
df['workload_type'] = df['workload_type'].map({0: 'idle', 1: 'synthetic', 2: 'specweb', 3: 'oltpbench', 4: 'memcached', 5: 'dacapo', 6: 'parsec', 7: 'bzip', 8: 'mplayer'})
df[:10]
ax = df.groupby('capability')['capability'].count().plot.bar(
title='# Samples for Each Live Migration Technique', edgecolor='k', linewidth=1.0, width=0.5, figsize=(4.5, 3.5))
ax.set_ylim([0, 9100])
for p in ax.patches:
ax.annotate(str(p.get_height()), (p.get_x() + 0.01, p.get_height() + 100))
x_axis = ax.axes.get_xaxis()
x_axis.get_label().set_visible(False)
y_axis = ax.axes.get_yaxis()
y_axis.set_label_text('Count')
plt.tight_layout()
ax = df.groupby('workload_type')['workload_type'].count().plot.bar(
title='# Samples for Each Workload Type', edgecolor='k', linewidth=1.0, width=0.5, figsize=(5.5, 4.5))
ax.set_ylim([0, 20000])
for p in ax.patches:
offset = 0
if p.get_height() < 100:
offset = 0.1
elif p.get_height() < 1000:
offset = -0.02
elif p.get_height() < 10000:
offset = -0.1
elif p.get_height() < 100000:
offset = -0.2
ax.annotate(str(p.get_height()), (p.get_x() + offset, p.get_height()+500))
x_axis = ax.axes.get_xaxis()
x_axis.get_label().set_visible(False)
y_axis = ax.axes.get_yaxis()
y_axis.set_label_text('Samples')
plt.tight_layout()
We augment the original features with composed features derived from them to improve model accuracy. This step, commonly known as "feature engineering", is standard practice in machine learning; a quick correlation check after the feature definitions below illustrates the idea.
def get_RPTR(row):
    # RPTR: working set size scaled by the squared ratio of page dirty rate
    # to page transfer rate, capped at the working set size itself.
    RPTR = row['VM_wss'] * (row['VM_pdr'] / row['VM_ptr'])**2
    return min(RPTR, row['VM_wss'])
def get_THR_benefit(row):
    # Benefit proxy for the THR technique: the page dirty rate weighted by
    # the vCPU utilization (clamped to 100%).
    return row['VM_pdr'] * min(row['VM_cpu_util'] / 100, 1.0)
def get_DLTC_benefit(row):
    # Benefit proxy for the DLTC technique: the working set size weighted by
    # the modified words per page relative to half a page (4096 / 2); past
    # that point the weight tapers back down.
    v = row['VM_mwpp'] / (4096 / 2)
    if v > 1.0:
        v = 1.0 - (v - 1.0)
    return row['VM_wss'] * v
df['RPTR'] = df.apply(get_RPTR, axis=1)
df['THR_benefit'] = df.apply(get_THR_benefit, axis=1)
df['DLTC_benefit'] = df.apply(get_DLTC_benefit, axis=1)
df['VM_nwss'] = df['VM_size'] - df['VM_wss']
df['VM_e_wss'] = df['VM_wss'] * df['VM_wse']
df['VM_e_nwss'] = df['VM_nwss'] * df['VM_nwse']
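As a quick illustration of why composed features can help (a sanity check we add here, not part of the paper), compare how strongly a composed feature and the raw inputs it combines correlate with one of the targets:
# Our addition: the expectation is that a composed feature such as RPTR tracks
# the target more closely than the raw inputs it is built from.
for f in ['VM_pdr', 'VM_ptr', 'RPTR']:
    print('{:>8s} vs Total Time: r = {:+.2f}'.format(f, df[f].corr(df['Total Time'])))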
fig, ax_arr = plt.subplots(nrows=5, ncols=4, figsize=(9.5, 8.5))
for i, feature in enumerate(['VM_size', 'VM_pdr', 'VM_wss', 'VM_wse',
'VM_nwse', 'VM_mwpp', 'VM_pmu_instr', 'VM_ptr',
'VM_cpu_util', 'VM_net_util', 'src_cpu_avail', 'dst_cpu_avail',
'src_mem_avail', 'dst_mem_avail', 'RPTR', 'THR_benefit',
'DLTC_benefit', 'VM_nwss', 'VM_e_wss', 'VM_e_nwss']):
row_num, col_num = int(i / 4), i % 4
df[feature].plot.hist(title=feature, ax=ax_arr[row_num][col_num], edgecolor='black', linewidth=0.5, bins=30)
y_axis = ax_arr[row_num][col_num].axes.get_yaxis()
formatter = EngFormatter()
y_axis.set_major_formatter(formatter)
y_axis.get_label().set_visible(False)
for tick in ax_arr[row_num][col_num].get_yticklabels():
tick.set_rotation(45)
plt.tight_layout()
units = {'Total Time': '(ms)', 'Downtime': '(ms)', 'Total Data': '(bytes)', 'Performance': '', 'SRC.CPU': '(%)', 'SRC.MEM': '(MB)'}
fig, ax_arr = plt.subplots(nrows=2, ncols=4, figsize=(9.5, 3.5))
for i, feature in enumerate(['Total Time', 'Downtime', 'Total Data', 'Performance', 'SRC.CPU', 'SRC.MEM']):
row_num, col_num = int(i / 4), i % 4
df[feature].plot.hist(title='{} {}'.format(feature, units[feature]),
ax=ax_arr[row_num][col_num], edgecolor='black', linewidth=0.5, bins=30)
x_axis = ax_arr[row_num][col_num].axes.get_xaxis()
formatter = EngFormatter()
x_axis.set_major_formatter(formatter)
y_axis = ax_arr[row_num][col_num].axes.get_yaxis()
formatter = EngFormatter()
y_axis.set_major_formatter(formatter)
y_axis.get_label().set_visible(False)
for tick in ax_arr[row_num][col_num].get_yticklabels():
tick.set_rotation(45)
plt.tight_layout()
Instead of training one big model, we build a separate model for each migration technique and metric. This approach, often called "sub-modeling", tends to improve accuracy by reducing the dimensionality each individual model must capture; for contrast, a hypothetical sketch of the monolithic alternative follows the dataset construction below.
training_features = ['VM_size', 'VM_pdr', 'VM_wss', 'VM_wse', 'VM_nwse', 'VM_mwpp',
'VM_pmu_instr', 'VM_ptr', 'VM_cpu_util', 'VM_net_util',
'src_cpu_avail', 'dst_cpu_avail', 'src_mem_avail', 'dst_mem_avail',
'RPTR', 'THR_benefit', 'DLTC_benefit', 'VM_nwss', 'VM_e_wss', 'VM_e_nwss']
test_features = ['Total Time', 'Downtime', 'Total Data', 'Performance', 'SRC.CPU', 'SRC.MEM']  # the six target metrics the models predict
dataset = {}
for technique in ['PRE', 'THR', 'DLTC', 'DTC', 'POST']:
dataset[technique] = {}
dataset[technique]['X'] = df[df['capability'] == technique][training_features].copy(deep=True).astype(np.float64)
dataset[technique]['y'] = df[df['capability'] == technique][test_features].copy(deep=True).astype(np.float64)
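For contrast, the monolithic alternative mentioned above would fold the technique into the feature matrix, e.g. via one-hot encoding. A minimal sketch (hypothetical; not used anywhere below):
# Hypothetical monolithic setup, shown only for contrast with sub-modeling:
# one feature matrix over all techniques, with the technique one-hot encoded.
X_all = pd.get_dummies(df[training_features + ['capability']], columns=['capability']).astype(np.float64)
y_all = df[test_features].copy().astype(np.float64)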
def get_cv_result(X, y, model='linear', n_splits=10):
    cv_result = {'test': [], 'prediction': [], 'training_time': [], 'test_time': []}
    for train, test in KFold(n_splits=n_splits).split(X):
        X_train, y_train = X.iloc[train, :], y.iloc[train, :]
        X_test, y_test = X.iloc[test, :], y.iloc[test, :]
        # Fit the scalers on the training fold only, so no test statistics
        # leak into the model.
        X_scaler = StandardScaler().fit(X_train)
        y_scaler = StandardScaler().fit(y_train)
        n_jobs = multiprocessing.cpu_count()
        if model == 'linear':
            regr = MultiOutputRegressor(LinearRegression(), n_jobs=n_jobs)
        elif model == 'svr':
            regr = MultiOutputRegressor(SVR(C=10.0), n_jobs=n_jobs)
        else:
            raise ValueError('unknown model: {}'.format(model))
t1 = time.time()
regr.fit(X_scaler.transform(X_train), y_scaler.transform(y_train))
training_time = time.time() - t1
cv_result['training_time'].append((len(X_train), training_time))
t1 = time.time()
prediction = regr.predict(X_scaler.transform(X_test))
prediction = y_scaler.inverse_transform(prediction)
test_time = time.time() - t1
cv_result['test_time'].append((len(X_test), test_time))
for row in y_test.values:
cv_result['test'].append(dict(zip(test_features, row)))
for row in prediction:
cv_result['prediction'].append(dict(zip(test_features, row)))
return cv_result
total_result = {}
for technique in ['PRE', 'THR', 'DLTC', 'DTC', 'POST']:
print(technique)
cv_result = get_cv_result(dataset[technique]['X'], dataset[technique]['y'], model='svr', n_splits=10)
total_result[technique] = cv_result
error_result = {}
for technique in total_result:
n = len(total_result[technique]['test'])
error_result[technique] = {'abs_err': [], 'rel_err': []}
for i in range(n):
test = total_result[technique]['test'][i]
prediction = total_result[technique]['prediction'][i]
        row_abs_err = {}
        row_rel_err = {}
        for metric in test:
            # Skip metrics whose measured value is zero; the relative error is
            # undefined there, and the missing entries become NaNs that the
            # aggregation below drops.
            if test[metric] == 0.0:
                continue
abs_err = abs(prediction[metric] - test[metric])
rel_err = (prediction[metric] - test[metric]) / test[metric]
row_abs_err[metric] = abs_err
row_rel_err[metric] = rel_err
error_result[technique]['abs_err'].append(row_abs_err)
error_result[technique]['rel_err'].append(row_rel_err)
use_geomean = True
df_error = []
for feature in test_features:
data = {}
for technique in ['PRE', 'THR', 'DLTC', 'DTC', 'POST']:
data[technique] = list(pd.DataFrame(error_result[technique]['abs_err'])[feature].values)
if use_geomean:
df_tmp = pd.DataFrame(data[technique], columns=['err']).dropna()
df_error.append((feature, technique, gmean(abs(df_tmp['err']))))
else:
df_error.append((feature, technique, pd.DataFrame(data[technique]).mean().values[0]))
df_error = pd.DataFrame(df_error, columns=['Metric', 'Technique', 'Abs.Error'])
units = {'Total Time': '(ms)', 'Downtime': '(ms)', 'Total Data': '(bytes)', 'Performance': '', 'SRC.CPU': '(%)', 'SRC.MEM': '(MB)'}
fig, ax_arr = plt.subplots(nrows=2, ncols=3, figsize=(9.5, 5))
for i, feature in enumerate(test_features):
row_num, col_num = int(i / 3), i % 3
df_error[df_error['Metric'] == feature].plot.bar(
title=feature, x='Technique', y='Abs.Error', edgecolor='k', linewidth=1.0,
legend=False, ax=ax_arr[row_num][col_num])
x_axis = ax_arr[row_num][col_num].axes.get_xaxis()
y_axis = ax_arr[row_num][col_num].axes.get_yaxis()
x_axis.get_label().set_visible(False)
y_axis.set_label_text('{} {}'.format('Absolute Error', units[feature]))
for tick in ax_arr[row_num][col_num].get_xticklabels():
tick.set_rotation(45)
for tick in ax_arr[row_num][col_num].get_yticklabels():
tick.set_rotation(45)
plt.tight_layout()
use_geomean = True
df_error = []
for feature in test_features:
data = {}
for technique in ['PRE', 'THR', 'DLTC', 'DTC', 'POST']:
data[technique] = list(pd.DataFrame(error_result[technique]['rel_err'])[feature].values)
if use_geomean:
df_tmp = pd.DataFrame(data[technique], columns=['err']).dropna()
df_error.append((feature, technique, gmean(abs(df_tmp['err']))))
else:
df_error.append((feature, technique, pd.DataFrame(data[technique]).mean().values[0]))
df_error = pd.DataFrame(df_error, columns=['Metric', 'Technique', 'Rel.Error'])
fig, ax_arr = plt.subplots(nrows=2, ncols=3, figsize=(9.5, 5))
for i, feature in enumerate(test_features):
row_num, col_num = int(i / 3), i % 3
df_error[df_error['Metric'] == feature].plot.bar(
title=feature, x='Technique', y='Rel.Error', edgecolor='k', linewidth=1.0,
legend=False, ax=ax_arr[row_num][col_num])
x_axis = ax_arr[row_num][col_num].axes.get_xaxis()
y_axis = ax_arr[row_num][col_num].axes.get_yaxis()
x_axis.get_label().set_visible(False)
y_axis.set_label_text('{}'.format('Relative Error'))
for tick in ax_arr[row_num][col_num].get_xticklabels():
tick.set_rotation(45)
for tick in ax_arr[row_num][col_num].get_yticklabels():
tick.set_rotation(45)
plt.tight_layout()
for technique in total_result:
    mean_training_time = statistics.mean([v[1] for v in total_result[technique]['training_time']])
    total_test_time = sum([v[1] for v in total_result[technique]['test_time']])
    test_n = sum([v[0] for v in total_result[technique]['test_time']])
    print('[{:>4s}] Training Time: {:.3f}s, Prediction Throughput: {:.0f} / sec'.format(
        technique, len(test_features) * mean_training_time, test_n / total_test_time))
Let's use the model to solve a real-world problem in the datacenter: picking the live migration technique that minimizes total migration time for each VM.
df = pd.read_csv('dataset/2017.socc.dataset.exclude.512.csv')
# Shuffle rows
df = df.sample(frac=1).reset_index(drop=True)
# Rename column names for readability
df = df.rename(columns={'qemu_tt': 'Total Time', 'qemu_dt': 'Downtime', 'qemu_td': 'Total Data',
'performance': 'Performance', 'used_cpu_src': 'SRC.CPU', 'used_mem_src': 'SRC.MEM'})
# Rename the column values to readable strings
df['capability'] = df['capability'].map({0: 'PRE', 1: 'THR', 2: 'DLTC', 3: 'DTC', 4: 'POST'})
df['workload_type'] = df['workload_type'].map({0: 'idle', 1: 'synthetic', 2: 'specweb', 3: 'oltpbench', 4: 'memcached', 5: 'dacapo', 6: 'parsec', 7: 'bzip', 8: 'mplayer'})
df['RPTR'] = df.apply(get_RPTR, axis=1)
df['THR_benefit'] = df.apply(get_THR_benefit, axis=1)
df['DLTC_benefit'] = df.apply(get_DLTC_benefit, axis=1)
df['VM_nwss'] = df['VM_size'] - df['VM_wss']
df['VM_e_wss'] = df['VM_wss'] * df['VM_wse']
df['VM_e_nwss'] = df['VM_nwss'] * df['VM_nwse']
dataset = {}
for technique in ['PRE', 'THR', 'DLTC', 'DTC', 'POST']:
dataset[technique] = {}
dataset[technique]['X'] = df[df['capability'] == technique][training_features].copy(deep=True).astype(np.float64)
dataset[technique]['y'] = df[df['capability'] == technique][test_features].copy(deep=True).astype(np.float64)
model = 'svr'
regressor = {}
for technique in ['PRE', 'THR', 'DLTC', 'DTC', 'POST']:
    print('{}'.format(technique))
    regressor[technique] = {}
X = dataset[technique]['X']
y = dataset[technique]['y']
X_scaler = StandardScaler().fit(X)
y_scaler = StandardScaler().fit(y)
n_jobs = multiprocessing.cpu_count()
    if model == 'linear':
        regr = MultiOutputRegressor(LinearRegression(), n_jobs=n_jobs)
    elif model == 'svr':
        regr = MultiOutputRegressor(SVR(C=10.0), n_jobs=n_jobs)
    else:
        raise ValueError('unknown model: {}'.format(model))
regr.fit(X_scaler.transform(X), y_scaler.transform(y))
regressor[technique]['X_scaler'] = X_scaler
regressor[technique]['y_scaler'] = y_scaler
regressor[technique]['regr'] = regr
df_512 = pd.read_csv('dataset/2017.socc.dataset.512.migrations.csv')
# Rename column names for readability
df_512 = df_512.rename(columns={'qemu_tt': 'Total Time', 'qemu_dt': 'Downtime', 'qemu_td': 'Total Data',
'performance': 'Performance', 'used_cpu_src': 'SRC.CPU', 'used_mem_src': 'SRC.MEM'})
# Rename the column values to readable strings
df_512['capability'] = df_512['capability'].map({0: 'PRE', 1: 'THR', 2: 'DLTC', 3: 'DTC', 4: 'POST'})
df_512['RPTR'] = df_512.apply(get_RPTR, axis=1)
df_512['THR_benefit'] = df_512.apply(get_THR_benefit, axis=1)
df_512['DLTC_benefit'] = df_512.apply(get_DLTC_benefit, axis=1)
df_512['VM_nwss'] = df_512['VM_size'] - df_512['VM_wss']
df_512['VM_e_wss'] = df_512['VM_wss'] * df_512['VM_wse']
df_512['VM_e_nwss'] = df_512['VM_nwss'] * df_512['VM_nwse']
df_512[:10]
predict_model = {}
# Average the input features per workload seed; numeric_only skips the string columns.
for seed, row in df_512.groupby('workload_seed').mean(numeric_only=True).iterrows():
predict_model[seed] = {}
X = [row[f] for f in training_features]
for technique in ['PRE', 'THR', 'DLTC', 'DTC', 'POST']:
predict_model[seed][technique] = {}
regr = regressor[technique]['regr']
X_scaler = regressor[technique]['X_scaler']
y_scaler = regressor[technique]['y_scaler']
X_scaled = X_scaler.transform(np.array(X).reshape(1, -1))
y_prediction = y_scaler.inverse_transform(regr.predict(X_scaled))[0]
y_prediction = dict(zip(test_features, y_prediction))
predict_model[seed][technique].update(y_prediction)
predict_oracle = {}
for _, row in df_512.iterrows():
seed = row['workload_seed']
technique = row['capability']
if seed not in predict_oracle:
predict_oracle[seed] = {}
if technique not in predict_oracle[seed]:
predict_oracle[seed][technique] = {}
predict_oracle[seed][technique].update(dict(zip(test_features, [row[metric] for metric in test_features])))
print('One Case Test')
print(' [MODEL] Seed: {}, Technique: {}, Total Time: {}'.format(0, 'PRE', predict_model[0]['PRE']['Total Time']))
print('[ORACLE] Seed: {}, Technique: {}, Total Time: {}'.format(0, 'PRE', predict_oracle[0]['PRE']['Total Time']))
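The single case above is anecdotal. As a quick aggregate check (our addition, reusing the two dictionaries), we can summarize the 'Total Time' prediction error over all seeds and techniques:
# Aggregate check (our addition): geometric mean of the absolute relative
# error of the averaged-feature predictions against the measured values.
rel_errs = []
for seed in predict_model:
    for technique in predict_model[seed]:
        if seed in predict_oracle and technique in predict_oracle[seed]:
            measured = predict_oracle[seed][technique]['Total Time']
            predicted = predict_model[seed][technique]['Total Time']
            if measured != 0.0:
                rel_errs.append(abs(predicted - measured) / measured)
print('Geomean relative error (Total Time): {:.3f}'.format(gmean(rel_errs)))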
Model-guided selection of the live migration technique achieves results comparable to an oracle that knows the measured outcome of every technique in advance.
selection = {}
total_time_sum = 0
for i in range(len(predict_model)):
    # Choose the technique with the smallest *predicted* total time, then
    # charge the *measured* (oracle) time of that choice.
    total_time, technique = min((predict_model[i][t]['Total Time'], t) for t in predict_model[i])
    selection[technique] = selection.get(technique, 0) + 1
    total_time_sum += predict_oracle[i][technique]['Total Time']
print('Selected Techniques: {}'.format(selection))
print('Mean Total Migration Time: {:.0f} ms'.format(total_time_sum / len(predict_model)))
selection = {}
total_time_sum = 0
for i in range(len(predict_oracle)):
    # The oracle simply takes the technique with the smallest *measured* total time.
    total_time, technique = min((predict_oracle[i][t]['Total Time'], t) for t in predict_oracle[i])
    selection[technique] = selection.get(technique, 0) + 1
    total_time_sum += total_time
print('Selected Techniques: {}'.format(selection))
print('Mean Total Migration Time: {:.0f} ms'.format(total_time_sum / len(predict_oracle)))
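To put a number on "comparable", the following helper (our addition, not part of the original notebook) generalizes the two loops above: it charges the measured time of whatever technique a prediction table selects, so the ratio of the two means is the slowdown of model-guided selection relative to the oracle.
def mean_selected_total_time(choices, measured):
    # For every seed, pick the technique with the smallest 'Total Time' in
    # `choices`, then charge the *measured* time of that pick.
    total = 0
    for seed in choices:
        _, technique = min((choices[seed][t]['Total Time'], t) for t in choices[seed])
        total += measured[seed][technique]['Total Time']
    return total / len(choices)
model_mean = mean_selected_total_time(predict_model, predict_oracle)
oracle_mean = mean_selected_total_time(predict_oracle, predict_oracle)
print('Model-guided / oracle mean total time: {:.3f}'.format(model_mean / oracle_mean))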